#! -*- coding:utf-8 -*-
'''
Scrape HTML pages from the network and extract their text content.
'''
import sys
import time
import uuid
import re
import codecs
import urllib2
from bs4 import BeautifulSoup


# Python-2-only hack: reload(sys) re-exposes setdefaultencoding (which
# site.py deletes at startup) so the process-wide implicit str<->unicode
# codec can be forced to utf-8 for the rest of the run.
reload(sys)
sys.setdefaultencoding('utf-8')

def saveurl(url, file):
    '''Download the page at *url* and save its content to *file* as utf-8.

    Bug fixed: the original ignored the *file* argument and always wrote
    to the hard-coded path 'man_data.txt'; it also leaked both the HTTP
    response and the output file handle if an exception occurred, and
    printed the whole page as debug noise.
    '''
    response = urllib2.urlopen(url)
    try:
        content = response.read()
    finally:
        response.close()
    # write as utf-8 text, matching how readdata/getYian consume it later
    man_file = codecs.open(file, 'w', 'utf-8')
    try:
        man_file.write(content)
    finally:
        man_file.close()


def readdata(file):
    '''Return the entire contents of *file* as a single string.

    Uses a context manager so the handle is closed even if read()
    raises (the original leaked the handle on error).
    '''
    with open(file, "r") as read_file:
        return read_file.read()

def getYian(htmltext):
    '''Extract the text of the <div class="content"> element of *htmltext*.

    <br> tags become explicit newlines, ASCII spaces and the listed
    non-breaking/full-width space escapes are removed, and the cleaned
    text is also written to 'tmp_data.txt' (side effect kept from the
    original).  Returns the cleaned text, or '' if no content div exists.

    Bug fixed: the original called ret.strip() and discarded the result,
    so the text was never actually stripped.
    '''
    # drop &nbsp; and the various non-breaking / full-width space escapes
    htmltext = re.sub(r'&nbsp;|\xa0|\\xa0|\u3000|\\u3000|\\u0020|\u0020', '', str(htmltext))
    soup = BeautifulSoup(htmltext, "html.parser")
    d = soup.find('div', {'class': 'content'})
    if d is None:
        # page has no content div: nothing to extract (original raised
        # AttributeError here)
        return ''
    # make every <br> an explicit line break in the extracted text
    for br in d.find_all('br'):
        br.insert_before("\n")
    ret = d.get_text("").replace(" ", "")
    ret = ret.strip()  # fix: keep strip()'s return value
    man_file = codecs.open('tmp_data.txt', 'w', 'utf-8')
    try:
        man_file.write(ret)
    finally:
        man_file.close()
    return ret


url='http://www.tcm100.com/user/hhyian/zzbook14.htm'
file="man_data.txt"
#saveurl(url,file)
content=readdata(file)
ds = getYian(content)
print 'ds',ds


time.sleep( 1 )
print str(uuid.uuid1()).replace("-","")

